package crawler

import com.lheia.downloader.MiddlewareDownloader
import com.lheia.process.SpierGZHPProcess
import com.lheia.proxy.FileProxyProvider
import com.lheia.util.RequestExtraConstants
import us.codecraft.webmagic.Request
import us.codecraft.webmagic.Spider


/**
 * Grails Quartz job that crawls the key-disease list page of www.hp.gov.cn
 * with a WebMagic spider, using a Redis-backed scheduler and a GORM pipeline.
 */
class SpiderJob {
    // Grails beans injected by property name — do NOT rename these fields,
    // even though "resdis" looks like a typo for "redis": injection is by name.
    def resdisScheduler
    def gormPipeline

    static triggers = {
        /**
         * Fires every day at 16:56:58 (Quartz fields: sec min hour dom month dow).
         * NOTE(review): the original comment claimed "weekdays at 2 AM"
         * (每周一到周五凌晨两点), which does NOT match this expression —
         * confirm which schedule is actually intended.
         */
        cron cronExpression: "58 56 16 ? * *"
    }

    /**
     * Runs one crawl: seeds a single list-page request, executes the spider
     * single-threaded, and prints the elapsed wall-clock time.
     */
    def execute() {
        long startMillis = System.currentTimeMillis()

        // Downloader with a file-based proxy provider; the file name
        // "proxyes.dat" is kept as-is (existing on-disk artifact).
        def downloader = new MiddlewareDownloader()
        downloader.setProxyProvider(FileProxyProvider.from("./proxyes.dat"))

        Spider spider = Spider.create(new SpierGZHPProcess())

        // Seed request: the list page to crawl.
        Request request = new Request("http://www.hp.gov.cn/hp/zdhjbh/zdly_list2.shtml")
        // Render the page with a headless browser (PhantomJS).
        request.putExtra(RequestExtraConstants.PHANTOM_JS, true)
        // Exempt this seed from de-duplication so repeated job runs re-fetch it.
        request.putExtra(RequestExtraConstants.NO_NEED_TO_REMOVE, true)
        // Name of the processor callback invoked for this page.
        request.putExtra(RequestExtraConstants.CALLBACK, "processList")
        spider.addRequest(request)

        spider.setScheduler(resdisScheduler)
                .addPipeline(gormPipeline)
                .setDownloader(downloader)
                .thread(1)
                .run()

        long elapsedMillis = System.currentTimeMillis() - startMillis
        println("抓取用时" + elapsedMillis + "ms")
    }
}
